# Load libraries
library(edgeR)
library(AnnotationHub)
library(tidyverse)
library(magrittr)
library(scales)
library(pander)
library(ggrepel)
library(fgsea)
library(pheatmap)
library(plyr)
library(igraph)
library(tidygraph)
library(ggraph)
# Load counts analysed by feature counts
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
dplyr::select(Geneid, starts_with("W"), starts_with("Q"))
# Create DGEList and calculate normalisaton factors
dgeList <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
# Set group variable
dgeList$samples$group <- colnames(dgeList) %>%
str_extract("(W|Q)") %>%
factor(levels = c("W", "Q"))
# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
subset(species == "Danio rerio") %>%
subset(dataprovider == "Ensembl") %>%
subset(rdataclass == "EnsDb")
## AnnotationHub with 11 records
## # snapshotDate(): 2019-05-02
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH53201"]]'
##
## title
## AH53201 | Ensembl 87 EnsDb for Danio Rerio
## AH53705 | Ensembl 88 EnsDb for Danio Rerio
## AH56671 | Ensembl 89 EnsDb for Danio Rerio
## AH57746 | Ensembl 90 EnsDb for Danio Rerio
## AH60762 | Ensembl 91 EnsDb for Danio Rerio
## ... ...
## AH64434 | Ensembl 93 EnsDb for Danio Rerio
## AH64906 | Ensembl 94 EnsDb for Danio rerio
## AH67932 | Ensembl 95 EnsDb for Danio rerio
## AH69169 | Ensembl 96 EnsDb for Danio rerio
## AH73861 | Ensembl 97 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH64906"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name", "gene_biotype", "entrezid")]
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList$genes <- genesGR[rownames(dgeList),]
# Perform logical test to see how many genes were not detected in dataset
dgeList$counts %>%
rowSums() %>%
is_greater_than(0) %>%
table()
## .
## FALSE TRUE
## 3927 28130
# Check for genes having > 4 samples with cpm > 1
dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4) %>%
table()
## .
## FALSE TRUE
## 13704 18353
# Create logical vector of genes to keep that fit criteria
genes2keep <- dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeList %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "Before Filtering")
dgeFilt %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "After Filtering")
par(mfrow = c(1,1))
# Check library sizes with box plot
dgeFilt$samples %>%
ggplot(aes(group, lib.size, fill = group)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
labs(x = "Genotype", y = "Library Size") +
scale_fill_discrete(name ="Genotype", labels = c("Wildtype", "Mutant")) +
scale_x_discrete(labels=c("W" = "Wildtype", "Q" = "Mutant")) +
theme_bw()
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pca <- dgeFilt %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 |
|---|---|---|---|---|---|---|---|---|---|
| Standard deviation | 22.27 | 18.07 | 16.75 | 14.73 | 14.45 | 13.34 | 11.87 | 11.2 | 5.671e-14 |
| Proportion of Variance | 0.2513 | 0.1655 | 0.1421 | 0.1099 | 0.1058 | 0.09023 | 0.07145 | 0.06362 | 0 |
| Cumulative Proportion | 0.2513 | 0.4168 | 0.559 | 0.6689 | 0.7747 | 0.8649 | 0.9364 | 1 | 1 |
# Plot PCA
pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFilt$samples, "sample")) %>%
ggplot(aes(PC1, PC2, colour = group, label = sample)) +
geom_point() +
geom_text_repel() +
theme_bw()
# Create model matrix
design <- model.matrix(~group, data = dgeFilt$samples)
# Perform exact test on DGEList
topTable <- dgeFilt %>%
estimateDisp(design = design) %>%
exactTest() %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
dplyr::select(Geneid = ID.gene_id,
Symbol = ID.gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = ID.entrezid) %>%
mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(-log10(P.Value) > 4 | abs(logFC) > 2.5), aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
theme_bw() +
theme(legend.position = "none")
# MD Plot showing DE genes
topTable %>%
arrange(desc(P.Value)) %>%
ggplot(aes(AveExpr, logFC, colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
labs(x = "Average Expression (log2 CPM)",
y = "log Fold-Change") +
theme_bw() +
theme(legend.position = "none")
# Summary of DE genes
topTableDE <- topTable %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR)
topTableDE %>% pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000091368 | AL954327.1 | 2.656 | -5.923 | 1.214e-08 | 0.0002228 |
| ENSDARG00000093214 | si:ch211-284e13.9 | 0.8189 | 1.542 | 6.086e-07 | 0.005585 |
| ENSDARG00000037421 | egr1 | 8.527 | -0.712 | 9.887e-07 | 0.006049 |
| ENSDARG00000017246 | prx | 2.326 | -2.614 | 1.823e-06 | 0.008364 |
| ENSDARG00000089477 | si:ch211-132g1.3 | 5.855 | 0.6079 | 3.135e-06 | 0.01089 |
| ENSDARG00000089382 | zgc:158463 | 5.631 | 0.6536 | 3.561e-06 | 0.01089 |
| ENSDARG00000080337 | NC_002333.4 | 11.17 | 0.4449 | 5.344e-06 | 0.01401 |
| ENSDARG00000096829 | blvrb | 3.025 | -1.521 | 1.643e-05 | 0.03386 |
| ENSDARG00000093438 | CU467110.1 | 4.596 | 0.5459 | 1.752e-05 | 0.03386 |
| ENSDARG00000091916 | ugt5b4 | -0.0384 | -1.445 | 1.994e-05 | 0.03386 |
| ENSDARG00000036304 | dnaaf3l | 1.358 | -1.44 | 2.029e-05 | 0.03386 |
ens2Entrez <- file.path("https://uofabioinformaticshub.github.io/Intro-NGS-fib", "data", "ens2Entrez.tsv") %>%
url() %>%
read_tsv()
de <- topTable %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid) %>%
left_join(ens2Entrez) %>%
dplyr::filter(!is.na(Entrez)) %>%
.[["Entrez"]] %>%
unique()
uv <- topTable %>%
dplyr::select(Geneid) %>%
left_join(ens2Entrez) %>%
dplyr::filter(!is.na(Entrez)) %>%
.[["Entrez"]] %>%
unique()
goResults <- goana(de = de, universe = uv, species = "Hs")
goResults %>%
rownames_to_column("GO ID") %>%
as_tibble() %>%
dplyr::filter(DE > 1) %>%
arrange(P.DE) %>%
mutate(FDR = p.adjust(P.DE, "fdr")) %>%
dplyr::filter(FDR < 0.05) %>%
mutate(`GO ID` = str_replace(`GO ID`, ":", "\\\\:")) %>%
pander(caption = "GO Terms potentially enriched in the set of differentially expressed genes")
| GO ID | Term | Ont | N | DE | P.DE | FDR |
# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids (Not sure how this works, Steve wrote it)
convertHsEG2Dr <- function(ids, df = idConvert){
dplyr::filter(df, EntrezID %in% ids)$Geneid
}
# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
dplyr::select(label = zfID, symbol = zfName) %>%
na.omit() %>%
unique()
# Create named vector of gene level statistics
ranks <- topTable %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
arrange(desc(stat)) %>%
with(structure(stat, names = Geneid))
# Import hallmark human gene genesets and tidy gene set names
# .gmt files downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp
# http://data.wikipathways.org/20190610/
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "HALLMARK_"))
kegg <- gmtPathways("../files/c2.cp.kegg.v6.2.entrez.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "KEGG_"))
wiki <- gmtPathways("../files/wikipathways-20190610-gmt-Homo_sapiens.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "%.+"))
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmark <- fgsea(hallmark, ranks, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::filter(padj < 0.05) %>%
arrange(pval)
fgseaHallmark %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| MYC_TARGETS_V1 | 1.236e-05 | 0.0001011 | 0.6076 | 2.418 | 220 | 0.0006178 |
| OXIDATIVE_PHOSPHORYLATION | 1.237e-05 | 0.0001011 | 0.6488 | 2.58 | 219 | 0.0006184 |
| INTERFERON_GAMMA_RESPONSE | 1.251e-05 | 0.0001011 | 0.5684 | 2.242 | 200 | 0.0006256 |
| E2F_TARGETS | 1.254e-05 | 0.0001011 | 0.464 | 1.827 | 197 | 0.0006268 |
| ALLOGRAFT_REJECTION | 1.256e-05 | 0.0001011 | 0.5472 | 2.153 | 195 | 0.0006278 |
| DNA_REPAIR | 1.304e-05 | 0.0001011 | 0.5417 | 2.069 | 150 | 0.0006519 |
| INTERFERON_ALPHA_RESPONSE | 1.415e-05 | 0.0001011 | 0.5773 | 2.034 | 83 | 0.0007075 |
| MYOGENESIS | 5.222e-05 | 0.0003264 | -0.4095 | -1.859 | 216 | 0.002611 |
| WNT_BETA_CATENIN_SIGNALING | 6.038e-05 | 0.0003354 | -0.5614 | -2.034 | 52 | 0.003019 |
| ADIPOGENESIS | 0.0001853 | 0.0008935 | 0.4063 | 1.617 | 220 | 0.009267 |
| MTORC1_SIGNALING | 0.0001966 | 0.0008935 | 0.4066 | 1.624 | 228 | 0.009829 |
| G2M_CHECKPOINT | 0.0005689 | 0.002371 | 0.3955 | 1.572 | 216 | 0.02845 |
| KRAS_SIGNALING_DN | 0.0007387 | 0.002841 | -0.3669 | -1.599 | 155 | 0.03694 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(hallmark[fgseaHallmark$pathway],
ranks, fgseaHallmark, gseaParam = 0.5)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for KEGG
fgseaKEGG <- fgsea(kegg, ranks, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::filter(padj < 0.05) %>%
arrange(pval)
fgseaKEGG %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION | 1.207e-05 | 0.0002836 | 0.4479 | 1.812 | 260 | 0.002246 |
| CHEMOKINE_SIGNALING_PATHWAY | 1.268e-05 | 0.0002836 | 0.6442 | 2.524 | 187 | 0.002358 |
| HUNTINGTONS_DISEASE | 1.276e-05 | 0.0002836 | 0.5849 | 2.281 | 180 | 0.002373 |
| ALZHEIMERS_DISEASE | 1.293e-05 | 0.0002836 | 0.555 | 2.141 | 164 | 0.002406 |
| SPLICEOSOME | 1.333e-05 | 0.0002836 | 0.6179 | 2.32 | 131 | 0.002479 |
| OXIDATIVE_PHOSPHORYLATION | 1.337e-05 | 0.0002836 | 0.7172 | 2.685 | 128 | 0.002486 |
| PARKINSONS_DISEASE | 1.344e-05 | 0.0002836 | 0.7061 | 2.63 | 123 | 0.002501 |
| RIBOSOME | 1.426e-05 | 0.0002836 | 0.8662 | 3.032 | 79 | 0.002653 |
| NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY | 1.468e-05 | 0.0002836 | 0.6316 | 2.126 | 62 | 0.00273 |
| PROTEASOME | 1.525e-05 | 0.0002836 | 0.7112 | 2.259 | 45 | 0.002836 |
| LINOLEIC_ACID_METABOLISM | 2.872e-05 | 0.0004856 | -0.6266 | -2.178 | 43 | 0.005342 |
| ECM_RECEPTOR_INTERACTION | 3.345e-05 | 0.0005089 | -0.5644 | -2.212 | 79 | 0.006222 |
| ASCORBATE_AND_ALDARATE_METABOLISM | 4.629e-05 | 0.0005089 | -0.7435 | -3.304 | 181 | 0.00861 |
| PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS | 4.688e-05 | 0.0005089 | -0.7556 | -3.364 | 184 | 0.008719 |
| STARCH_AND_SUCROSE_METABOLISM | 4.85e-05 | 0.0005089 | -0.7369 | -3.307 | 196 | 0.00902 |
| PORPHYRIN_AND_CHLOROPHYLL_METABOLISM | 4.877e-05 | 0.0005089 | -0.7057 | -3.175 | 199 | 0.009071 |
| STEROID_HORMONE_BIOSYNTHESIS | 5.1e-05 | 0.0005089 | -0.695 | -3.152 | 213 | 0.009486 |
| FOCAL_ADHESION | 5.198e-05 | 0.0005089 | -0.4533 | -2.062 | 219 | 0.009669 |
| DRUG_METABOLISM_OTHER_ENZYMES | 5.222e-05 | 0.0005089 | -0.6915 | -3.146 | 220 | 0.009714 |
| DRUG_METABOLISM_CYTOCHROME_P450 | 5.856e-05 | 0.0005089 | -0.6977 | -3.24 | 262 | 0.01089 |
| RETINOL_METABOLISM | 5.896e-05 | 0.0005089 | -0.6954 | -3.231 | 264 | 0.01097 |
| METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 | 6.019e-05 | 0.0005089 | -0.6907 | -3.216 | 270 | 0.01119 |
| PATHWAYS_IN_CANCER | 7.027e-05 | 0.0005683 | -0.3296 | -1.568 | 334 | 0.01307 |
| SYSTEMIC_LUPUS_ERYTHEMATOSUS | 7.645e-05 | 0.0005925 | 0.61 | 1.93 | 44 | 0.01422 |
| AXON_GUIDANCE | 0.0001317 | 0.0009798 | -0.3794 | -1.663 | 162 | 0.0245 |
| JAK_STAT_SIGNALING_PATHWAY | 0.0001705 | 0.00122 | -0.3899 | -1.693 | 151 | 0.03172 |
| ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC | 0.0002086 | 0.001437 | -0.4541 | -1.817 | 89 | 0.03881 |
| PATHOGENIC_ESCHERICHIA_COLI_INFECTION | 0.000225 | 0.001495 | 0.5804 | 1.894 | 52 | 0.04185 |
| NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION | 0.0002659 | 0.001706 | -0.3377 | -1.544 | 228 | 0.04946 |
# Make a table plot of significant KEGG pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(kegg[fgseaKEGG$pathway],
ranks, fgseaKEGG, gseaParam = 0.5)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for WikiPathways
fgseaWiki <- fgsea(wiki, ranks, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::filter(padj < 0.05) %>%
arrange(pval)
fgseaWiki %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| Chemokine signaling pathway | 1.284e-05 | 0.0009715 | 0.5945 | 2.303 | 170 | 0.006691 |
| Nonalcoholic fatty liver disease | 1.299e-05 | 0.0009715 | 0.5834 | 2.239 | 156 | 0.006768 |
| mRNA Processing | 1.328e-05 | 0.0009715 | 0.5371 | 2.022 | 133 | 0.006919 |
| Electron Transport Chain (OXPHOS system in mitochondria) | 1.382e-05 | 0.0009715 | 0.7417 | 2.677 | 97 | 0.007198 |
| Cytoplasmic Ribosomal Proteins | 1.416e-05 | 0.0009715 | 0.8704 | 3.052 | 80 | 0.007377 |
| Parkin-Ubiquitin Proteasomal System pathway | 1.436e-05 | 0.0009715 | 0.6106 | 2.101 | 71 | 0.007483 |
| Oxidative phosphorylation | 1.466e-05 | 0.0009715 | 0.7527 | 2.517 | 60 | 0.007635 |
| Mitochondrial complex I assembly model OXPHOS system | 1.492e-05 | 0.0009715 | 0.7502 | 2.444 | 52 | 0.007772 |
| Nicotine Metabolism | 2.591e-05 | 0.001041 | -0.768 | -2.255 | 21 | 0.0135 |
| Striated Muscle Contraction Pathway | 2.788e-05 | 0.001041 | -0.6834 | -2.256 | 34 | 0.01452 |
| Irinotecan Pathway | 2.848e-05 | 0.001041 | -0.7029 | -2.396 | 39 | 0.01484 |
| Proteasome Degradation | 2.926e-05 | 0.001041 | 0.5898 | 1.979 | 61 | 0.01525 |
| Estrogen metabolism | 3.135e-05 | 0.001041 | -0.645 | -2.398 | 59 | 0.01633 |
| Pregnane X Receptor pathway | 3.404e-05 | 0.001041 | -0.6214 | -2.444 | 80 | 0.01773 |
| Constitutive Androstane Receptor Pathway | 3.417e-05 | 0.001041 | -0.6225 | -2.452 | 81 | 0.0178 |
| Tamoxifen metabolism | 3.485e-05 | 0.001041 | -0.6519 | -2.594 | 86 | 0.01816 |
| Aryl Hydrocarbon Receptor Pathway | 3.544e-05 | 0.001041 | -0.6158 | -2.472 | 91 | 0.01846 |
| Codeine and Morphine Metabolism | 3.598e-05 | 0.001041 | -0.7161 | -2.895 | 95 | 0.01875 |
| TGF-beta Signaling Pathway | 4.232e-05 | 0.00116 | -0.4216 | -1.822 | 148 | 0.02205 |
| Glucuronidation | 4.759e-05 | 0.001205 | -0.7632 | -3.401 | 187 | 0.02479 |
| Focal Adhesion | 5.09e-05 | 0.001205 | -0.4598 | -2.085 | 213 | 0.02652 |
| NRF2 pathway | 5.09e-05 | 0.001205 | -0.4504 | -2.043 | 213 | 0.02652 |
| Metapathway biotransformation Phase I and II | 7.519e-05 | 0.001703 | -0.5239 | -2.51 | 360 | 0.03918 |
| Nuclear Receptors Meta-Pathway | 8.361e-05 | 0.001815 | -0.3388 | -1.64 | 404 | 0.04356 |
# Make a table plot of significant WikiPathways pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(wiki[fgseaWiki$pathway],
ranks, fgseaWiki, gseaParam = 0.5)
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmark <-
fgseaHallmark %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysHallmark <- names(sigHallmark) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesHallmark <- unique(unlist(sigHallmark)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesHallmark <- full_join(pathwaysHallmark, genesHallmark, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesHallmark <- ldply(sigHallmark, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesHallmark, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesHallmark, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyHallmark <-
tbl_graph(nodes = nodesHallmark, edges = edgesHallmark, directed = FALSE) %>%
activate(nodes) %>%
mutate(pathways = case_when(id <= nrow(fgseaHallmark) ~
label)) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(DE = case_when(label %in% topTableDE$Geneid ~ symbol)) %>%
mutate(size = case_when(label %in% topTable$Geneid ~
as.integer(row_number(label %in% topTable$Geneid)),
id <= nrow(fgseaHallmark) ~
as.integer(4000))) %>%
mutate(colour = case_when(id <= nrow(fgseaHallmark) ~
rainbow(nrow(fgseaHallmark))[id],
label %in% topTableDE$Geneid ~
"black")) %>%
mutate(hjust = case_when(DE == "ugt5b4" ~
as.integer(0))) %>%
mutate(vjust = case_when(DE == "ugt5b4" ~
as.integer(5))) %>%
activate(edges) %>%
mutate(colour = case_when(from <= nrow(fgseaHallmark) ~
rainbow(nrow(fgseaHallmark))[from]))
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmark, layout = "fr") +
scale_fill_manual(values = c(rainbow(nrow(fgseaHallmark)), "black"),
na.value = "gray80") +
geom_edge_arc(aes(color = colour), alpha = 0.5, show.legend = FALSE, curvature = 0.5) +
geom_node_point(aes(size = size, fill = colour), shape = 21, stroke = 0.5,
show.legend = FALSE) +
geom_node_label(aes(label = pathways), repel = TRUE, size = 3,
alpha = 0.7, label.padding = 0.1) +
geom_node_text(aes(label = DE, hjust = hjust, vjust = vjust),
repel = TRUE, size = 3, alpha = 0.8, colour = "black") +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGG <-
fgseaKEGG %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysKEGG <- names(sigKEGG) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesKEGG <- unique(unlist(sigKEGG)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesKEGG <- full_join(pathwaysKEGG, genesKEGG, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesKEGG <- ldply(sigKEGG, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesKEGG, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesKEGG, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyKEGG <-
tbl_graph(nodes = nodesKEGG, edges = edgesKEGG, directed = FALSE) %>%
activate(nodes) %>%
mutate(pathways = case_when(id <= nrow(fgseaKEGG) ~
label)) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(DE = case_when(label %in% topTableDE$Geneid ~ symbol)) %>%
mutate(size = case_when(label %in% topTable$Geneid ~
as.integer(row_number(label %in% topTable$Geneid)),
id <= nrow(fgseaKEGG) ~
as.integer(4000))) %>%
mutate(colour = case_when(id <= nrow(fgseaKEGG) ~
rainbow(nrow(fgseaKEGG))[id],
label %in% topTableDE$Geneid ~
"black")) %>%
mutate(hjust = case_when(DE == "ugt5b4" ~
as.integer(-1),
DE == "blvrb" ~
as.integer(7))) %>%
mutate(vjust = case_when(DE == "ugt5b4" ~
as.integer(7),
DE == "blvrb" ~
as.integer(0))) %>%
activate(edges) %>%
mutate(colour = case_when(from <= nrow(fgseaKEGG) ~
rainbow(nrow(fgseaKEGG))[from]))
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(26)
# Plot graph
ggraph(tidyKEGG, layout = "fr") +
scale_fill_manual(values = c(rainbow(nrow(fgseaKEGG)), "black"),
na.value = "gray80") +
geom_edge_arc(aes(color = colour), alpha = 0.5, show.legend = FALSE, curvature = 0.5) +
geom_node_point(aes(size = size, fill = colour), shape = 21, stroke = 0.5,
show.legend = FALSE) +
geom_node_label(aes(label = pathways), repel = TRUE, size = 3,
alpha = 0.7, label.padding = 0.1) +
geom_node_text(aes(label = DE, hjust = hjust, vjust = vjust),
repel = TRUE, size = 3, alpha = 0.8, colour = "black") +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWiki <-
fgseaWiki %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysWiki <- names(sigWiki) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesWiki <- unique(unlist(sigWiki)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesWiki <- full_join(pathwaysWiki, genesWiki, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesWiki <- ldply(sigWiki, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesWiki, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesWiki, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyWiki <-
tbl_graph(nodes = nodesWiki, edges = edgesWiki, directed = FALSE) %>%
activate(nodes) %>%
mutate(pathways = case_when(id <= nrow(fgseaWiki) ~
label)) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(DE = case_when(label %in% topTableDE$Geneid ~ symbol)) %>%
mutate(size = case_when(label %in% topTable$Geneid ~
as.integer(row_number(label %in% topTable$Geneid)),
id <= nrow(fgseaWiki) ~
as.integer(4000))) %>%
mutate(colour = case_when(id <= nrow(fgseaWiki) ~
rainbow(nrow(fgseaWiki))[id],
label %in% topTableDE$Geneid ~
"black")) %>%
mutate(hjust = case_when(DE == "ugt5b4" ~
as.integer(1),
DE == "blvrb" ~
as.integer(2))) %>%
mutate(vjust = case_when(DE == "ugt5b4" ~
as.integer(7),
DE == "blvrb" ~
as.integer(7))) %>%
activate(edges) %>%
mutate(colour = case_when(from <= nrow(fgseaWiki) ~
rainbow(nrow(fgseaWiki))[from]))
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(27)
# Plot graph
ggraph(tidyWiki, layout = "fr") +
scale_fill_manual(values = c(rainbow(nrow(fgseaWiki)), "black"),
na.value = "gray80") +
geom_edge_arc(aes(color = colour), alpha = 0.5, show.legend = FALSE, curvature = 0.5) +
geom_node_point(aes(size = size, fill = colour), shape = 21, stroke = 0.5,
show.legend = FALSE) +
geom_node_label(aes(label = pathways), repel = TRUE, size = 3,
alpha = 0.7, label.padding = 0.1) +
geom_node_text(aes(label = DE, hjust = hjust, vjust = vjust),
repel = TRUE, size = 3, alpha = 0.8, colour = "black") +
theme_graph() +
theme(legend.position = "none")
# Load Nhi's DGEList
dgeListNhi <- read_rds("nhiData/dge_g.rds") %>%
.[,rownames(subset(.$samples, Age == 6 & Hypoxia == 0))]
# Set group variable
dgeListNhi$samples$group <- colnames(dgeListNhi) %>%
str_extract("(w|q)") %>%
factor(levels = c("w", "q"))
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeListNhi$genes <- genesGR[rownames(dgeListNhi),]
# Perform logical test to see how many genes were not detected in dataset
dgeListNhi$counts %>%
rowSums() %>%
is_greater_than(0) %>%
table()
## .
## FALSE TRUE
## 1295 24286
# Check for genes having >= 4 samples with cpm > 1
dgeListNhi %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4) %>%
table()
## .
## FALSE TRUE
## 7077 18504
# Create logical vector of genes to keep that fit criteria
genes2keepNhi <- dgeListNhi %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFiltNhi <- dgeListNhi[genes2keepNhi,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Remove unneeded columns from samples element
dgeFiltNhi$samples <- dgeFiltNhi$samples %>%
dplyr::select(group, lib.size, norm.factors)
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListNhi %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltNhi %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "After Filtering")
par(mfrow = c(1,1))
# Check library sizes with box plot
dgeFiltNhi$samples %>%
ggplot(aes(group, lib.size, fill = group)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
labs(x = "Genotype", y = "Library Size") +
scale_fill_discrete(name ="Genotype", labels = c("Wildtype", "Mutant")) +
scale_x_discrete(labels=c("w" = "Wildtype", "q" = "Mutant")) +
theme_bw()
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaNhi <- dgeFiltNhi %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaNhi)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 |
|---|---|---|---|---|---|---|---|---|
| Standard deviation | 21.49 | 17.06 | 15.8 | 14.64 | 13.53 | 12.98 | 4.632e-14 | 3.974e-14 |
| Proportion of Variance | 0.2945 | 0.1855 | 0.1591 | 0.1367 | 0.1167 | 0.1075 | 0 | 0 |
| Cumulative Proportion | 0.2945 | 0.48 | 0.6391 | 0.7758 | 0.8925 | 1 | 1 | 1 |
# Plot PCA
pcaNhi$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFiltNhi$samples, "sample")) %>%
ggplot(aes(PC1, PC2, colour = group, label = sample)) +
geom_point() +
geom_text_repel() +
theme_bw()
# Create model matrix
design <- model.matrix(~group, data = dgeFiltNhi$samples)
# Perform exact test on DGEList
topTableNhi <- dgeFiltNhi %>%
estimateDisp(design = design) %>%
exactTest() %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
dplyr::select(Geneid = ID.gene_id,
Symbol = ID.gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = ID.entrezid) %>%
mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTableNhi %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(-log10(P.Value) > 4 | abs(logFC) > 2.5), aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
theme_bw() +
theme(legend.position = "none")
# MD Plot showing DE genes
topTableNhi %>%
arrange(desc(P.Value)) %>%
ggplot(aes(AveExpr, logFC, colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
labs(x = "Average Expression (log2 CPM)",
y = "log Fold-Change") +
theme_bw() +
theme(legend.position = "none")
# Summary of DE genes
topTableDENhi <- topTableNhi %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR)
topTableDENhi %>% pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
# Create named vector of gene level statistics
ranksNhi <- topTableNhi %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
arrange(desc(stat)) %>%
with(structure(stat, names = Geneid))
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmarkNhi <- fgsea(hallmark, ranksNhi, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::filter(padj < 0.05) %>%
arrange(pval)
fgseaHallmarkNhi %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| INTERFERON_GAMMA_RESPONSE | 1.923e-05 | 0.0002096 | 0.4813 | 1.975 | 200 | 0.0009614 |
| ALLOGRAFT_REJECTION | 1.925e-05 | 0.0002096 | 0.4575 | 1.876 | 199 | 0.0009626 |
| MYC_TARGETS_V1 | 2.09e-05 | 0.0002096 | -0.4106 | -1.719 | 221 | 0.001045 |
| OXIDATIVE_PHOSPHORYLATION | 2.092e-05 | 0.0002096 | -0.6216 | -2.604 | 222 | 0.001046 |
| MTORC1_SIGNALING | 2.096e-05 | 0.0002096 | -0.4538 | -1.906 | 228 | 0.001048 |
| FATTY_ACID_METABOLISM | 0.000229 | 0.001808 | -0.3938 | -1.62 | 190 | 0.01145 |
| INTERFERON_ALPHA_RESPONSE | 0.0002531 | 0.001808 | 0.4833 | 1.773 | 89 | 0.01265 |
| CHOLESTEROL_HOMEOSTASIS | 0.000308 | 0.001925 | -0.4949 | -1.801 | 81 | 0.0154 |
| ADIPOGENESIS | 0.0003554 | 0.001974 | -0.373 | -1.562 | 221 | 0.01777 |
| HEME_METABOLISM | 0.0009595 | 0.004798 | -0.3682 | -1.526 | 202 | 0.04798 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(hallmark[fgseaHallmarkNhi$pathway],
ranksNhi, fgseaHallmarkNhi, gseaParam = 0.5)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaKEGGNhi <- fgsea(kegg, ranksNhi, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::filter(padj < 0.05) %>%
arrange(pval)
fgseaKEGGNhi %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| HEMATOPOIETIC_CELL_LINEAGE | 1.934e-05 | 0.0002791 | 0.6233 | 2.417 | 127 | 0.003597 |
| INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION | 1.952e-05 | 0.0002791 | 0.6994 | 2.143 | 34 | 0.00363 |
| AUTOIMMUNE_THYROID_DISEASE | 1.964e-05 | 0.0002791 | 0.7581 | 2.168 | 25 | 0.003653 |
| GRAFT_VERSUS_HOST_DISEASE | 1.964e-05 | 0.0002791 | 0.759 | 2.17 | 25 | 0.003653 |
| ASTHMA | 1.968e-05 | 0.0002791 | 0.7582 | 2.146 | 24 | 0.00366 |
| ALLOGRAFT_REJECTION | 1.971e-05 | 0.0002791 | 0.7805 | 2.165 | 22 | 0.003665 |
| FATTY_ACID_METABOLISM | 2.062e-05 | 0.0002791 | -0.6244 | -2.235 | 73 | 0.003836 |
| GLYCOLYSIS_GLUCONEOGENESIS | 2.064e-05 | 0.0002791 | -0.5847 | -2.183 | 94 | 0.00384 |
| RIBOSOME | 2.066e-05 | 0.0002791 | -0.6993 | -2.547 | 81 | 0.003843 |
| PARKINSONS_DISEASE | 2.068e-05 | 0.0002791 | -0.56 | -2.178 | 124 | 0.003847 |
| ALZHEIMERS_DISEASE | 2.069e-05 | 0.0002791 | -0.5385 | -2.177 | 164 | 0.003847 |
| HUNTINGTONS_DISEASE | 2.07e-05 | 0.0002791 | -0.5059 | -2.07 | 180 | 0.003851 |
| OXIDATIVE_PHOSPHORYLATION | 2.071e-05 | 0.0002791 | -0.6039 | -2.359 | 128 | 0.003853 |
| RETINOL_METABOLISM | 2.101e-05 | 0.0002791 | -0.4069 | -1.744 | 273 | 0.003907 |
| SYSTEMIC_LUPUS_ERYTHEMATOSUS | 3.907e-05 | 0.0004845 | 0.6091 | 2.05 | 54 | 0.007268 |
| TYPE_I_DIABETES_MELLITUS | 5.873e-05 | 0.0006827 | 0.6669 | 2.016 | 32 | 0.01092 |
| LEISHMANIA_INFECTION | 9.73e-05 | 0.001065 | 0.5497 | 1.925 | 67 | 0.0181 |
| TYROSINE_METABOLISM | 0.0001438 | 0.001367 | -0.5316 | -1.87 | 66 | 0.02675 |
| ABC_TRANSPORTERS | 0.0001438 | 0.001367 | -0.5679 | -1.897 | 50 | 0.02676 |
| DRUG_METABOLISM_CYTOCHROME_P450 | 0.000147 | 0.001367 | -0.3654 | -1.563 | 268 | 0.02735 |
| CITRATE_CYCLE_TCA_CYCLE | 0.0001638 | 0.001451 | -0.6163 | -1.926 | 36 | 0.03047 |
| METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 | 0.0001891 | 0.001598 | -0.3632 | -1.557 | 274 | 0.03517 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(kegg[fgseaKEGGNhi$pathway],
ranksNhi, fgseaKEGGNhi, gseaParam = 0.5)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaWikiNhi <- fgsea(wiki, ranksNhi, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::filter(padj < 0.05) %>%
arrange(pval)
fgseaWikiNhi %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| Ebola Virus Pathway on Host | 1.934e-05 | 0.001812 | 0.4847 | 1.909 | 144 | 0.01013 |
| Fatty Acid Omega Oxidation | 2.023e-05 | 0.001812 | -0.7884 | -2.396 | 32 | 0.0106 |
| Cytoplasmic Ribosomal Proteins | 2.044e-05 | 0.001812 | -0.6993 | -2.548 | 81 | 0.01071 |
| Amino Acid metabolism | 2.052e-05 | 0.001812 | -0.5504 | -2.081 | 102 | 0.01075 |
| Electron Transport Chain (OXPHOS system in mitochondria) | 2.054e-05 | 0.001812 | -0.6404 | -2.402 | 97 | 0.01076 |
| Nonalcoholic fatty liver disease | 2.075e-05 | 0.001812 | -0.4817 | -1.931 | 155 | 0.01087 |
| Metabolic reprogramming in colon cancer | 4.078e-05 | 0.003053 | -0.6226 | -2.098 | 53 | 0.02137 |
| Allograft Rejection | 5.885e-05 | 0.003562 | 0.5836 | 2.009 | 61 | 0.03084 |
| Oxidative phosphorylation | 6.118e-05 | 0.003562 | -0.601 | -2.075 | 60 | 0.03206 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(wiki[fgseaWikiNhi$pathway],
ranksNhi, fgseaWikiNhi, gseaParam = 0.5)
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmarkNhi <-
fgseaHallmarkNhi %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysHallmarkNhi <- names(sigHallmarkNhi) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesHallmarkNhi <- unique(unlist(sigHallmarkNhi)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesHallmarkNhi <- full_join(pathwaysHallmarkNhi, genesHallmarkNhi, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesHallmarkNhi <- ldply(sigHallmarkNhi, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesHallmarkNhi, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesHallmarkNhi, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyHallmarkNhi <-
tbl_graph(nodes = nodesHallmarkNhi, edges = edgesHallmarkNhi, directed = FALSE) %>%
activate(nodes) %>%
mutate(pathways = case_when(id <= nrow(fgseaHallmarkNhi) ~
label)) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(DE = case_when(label %in% topTableDENhi$Geneid ~ symbol)) %>%
mutate(size = case_when(label %in% topTableNhi$Geneid ~
as.integer(row_number(label %in% topTableNhi$Geneid)),
id <= nrow(fgseaHallmarkNhi) ~
as.integer(4000))) %>%
mutate(colour = case_when(id <= nrow(fgseaHallmarkNhi) ~
rainbow(nrow(fgseaHallmarkNhi))[id],
label %in% topTableDENhi$Geneid ~
"black")) %>%
activate(edges) %>%
mutate(colour = case_when(from <= nrow(fgseaHallmarkNhi) ~
rainbow(nrow(fgseaHallmarkNhi))[from]))
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmarkNhi, layout = "fr") +
scale_fill_manual(values = c(rainbow(nrow(fgseaHallmarkNhi)), "black"),
na.value = "gray80") +
geom_edge_arc(aes(color = colour), alpha = 0.5, show.legend = FALSE, curvature = 0.5) +
geom_node_point(aes(size = size, fill = colour), shape = 21, stroke = 0.5,
show.legend = FALSE) +
geom_node_label(aes(label = pathways), repel = TRUE, size = 3,
alpha = 0.7, label.padding = 0.1) +
geom_node_text(aes(label = DE),
repel = TRUE, size = 3, alpha = 0.8, colour = "black") +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGGNhi <-
fgseaKEGGNhi %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysKEGGNhi <- names(sigKEGGNhi) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesKEGGNhi <- unique(unlist(sigKEGGNhi)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesKEGGNhi <- full_join(pathwaysKEGGNhi, genesKEGGNhi, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesKEGGNhi <- ldply(sigKEGGNhi, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesKEGGNhi, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesKEGGNhi, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyKEGGNhi <-
tbl_graph(nodes = nodesKEGGNhi, edges = edgesKEGGNhi, directed = FALSE) %>%
activate(nodes) %>%
mutate(pathways = case_when(id <= nrow(fgseaKEGGNhi) ~
label)) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(DE = case_when(label %in% topTableDENhi$Geneid ~ symbol)) %>%
mutate(size = case_when(label %in% topTableNhi$Geneid ~
as.integer(row_number(label %in% topTableNhi$Geneid)),
id <= nrow(fgseaKEGGNhi) ~
as.integer(4000))) %>%
mutate(colour = case_when(id <= nrow(fgseaKEGGNhi) ~
rainbow(nrow(fgseaKEGGNhi))[id],
label %in% topTableDENhi$Geneid ~
"black")) %>%
activate(edges) %>%
mutate(colour = case_when(from <= nrow(fgseaKEGGNhi) ~
rainbow(nrow(fgseaKEGGNhi))[from]))
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyKEGGNhi, layout = "fr") +
scale_fill_manual(values = c(rainbow(nrow(fgseaKEGGNhi)), "black"),
na.value = "gray80") +
geom_edge_arc(aes(color = colour), alpha = 0.5, show.legend = FALSE, curvature = 0.5) +
geom_node_point(aes(size = size, fill = colour), shape = 21, stroke = 0.5,
show.legend = FALSE) +
geom_node_label(aes(label = pathways), repel = TRUE, size = 3,
alpha = 0.7, label.padding = 0.1) +
geom_node_text(aes(label = DE),
repel = TRUE, size = 3, alpha = 0.8, colour = "black") +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWikiNhi <-
fgseaWikiNhi %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysWikiNhi <- names(sigWikiNhi) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesWikiNhi <- unique(unlist(sigWikiNhi)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesWikiNhi <- full_join(pathwaysWikiNhi, genesWikiNhi, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesWikiNhi <- ldply(sigWikiNhi, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesWikiNhi, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesWikiNhi, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyWikiNhi <-
tbl_graph(nodes = nodesWikiNhi, edges = edgesWikiNhi, directed = FALSE) %>%
activate(nodes) %>%
mutate(pathways = case_when(id <= nrow(fgseaWikiNhi) ~
label)) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(DE = case_when(label %in% topTableDENhi$Geneid ~ symbol)) %>%
mutate(size = case_when(label %in% topTableNhi$Geneid ~
as.integer(row_number(label %in% topTableNhi$Geneid)),
id <= nrow(fgseaWikiNhi) ~
as.integer(4000))) %>%
mutate(colour = case_when(id <= nrow(fgseaWikiNhi) ~
rainbow(nrow(fgseaWikiNhi))[id],
label %in% topTableDENhi$Geneid ~
"black")) %>%
activate(edges) %>%
mutate(colour = case_when(from <= nrow(fgseaWikiNhi) ~
rainbow(nrow(fgseaWikiNhi))[from]))
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyWikiNhi, layout = "fr") +
scale_fill_manual(values = c(rainbow(nrow(fgseaWikiNhi)), "black"),
na.value = "gray80") +
geom_edge_arc(aes(color = colour), alpha = 0.5, show.legend = FALSE, curvature = 0.5) +
geom_node_point(aes(size = size, fill = colour), shape = 21, stroke = 0.5,
show.legend = FALSE) +
geom_node_label(aes(label = pathways), repel = TRUE, size = 3,
alpha = 0.7, label.padding = 0.1) +
geom_node_text(aes(label = DE),
repel = TRUE, size = 3, alpha = 0.8, colour = "black") +
theme_graph() +
theme(legend.position = "none")